# Originally a Qiushibaike joke crawler; that site is gone, so this now crawls Baidu Tieba.
# (The Qiushibaike version was notable for using the re.S pattern-modifier flag to handle newlines.)

# We scrape the thread URLs (links), not the thread contents.
import urllib.request
import re

# Search keyword for the Tieba forum ("Re:Zero - Starting Life in Another World").
keywd = "从零开始的异世界生活"
# Percent-encode the keyword so it is safe to embed in the URL query string.
keywd_code = urllib.request.quote(keywd)

# Spoof a browser User-Agent: Tieba may block the default Python UA.
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
           "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# BUG FIX: the opener was built but never installed, so urlopen() below was
# still sending the default Python User-Agent. Install it globally.
urllib.request.install_opener(opener)

# Regex for the relative thread link inside each listing entry; compile once
# since it is reused on every page.
link_pat = re.compile('<a rel="noreferrer" href="(.*?)" title="')

for page in range(1, 6):
    # Tieba paginates 50 threads per page; "pn" is the thread offset.
    url = ("https://tieba.baidu.com/f?kw=" + keywd_code
           + "&pn=" + str((page - 1) * 50))
    data = urllib.request.urlopen(url).read().decode("UTF-8", "ignore")
    # Print each thread as an absolute URL.
    for link in link_pat.findall(data):
        print("https://tieba.baidu.com" + link)
        print("------------------------")

# NOTE(review): removed a block of commented-out scratch code (an earlier draft
# of the crawler loop with a corrupted URL). It was a bare triple-quoted string
# with no runtime effect; dead code should not be kept in comments.